Clear all variables before starting

# Remove every (non-hidden) object from the current environment so that
# nothing left over from an earlier run can influence this analysis.
# NOTE: in an interactive session this also wipes unrelated work.
rm(list = ls())

Load required libraries

library(text2vec)
library(data.table)
library(magrittr)
library(sentimentr)
library(glmnet)
library(vip)

Data preparation

Here, we load the train and test data into their respective datasets. Additionally, we make use of the crowdflower_weather dataset, which is available after loading the sentimentr package. We first convert the crowdflower_weather dataset’s sentiments to the same scale as our dataset (-1 -> 1, 0 -> 2, 1 -> 3) and then append it to the training data, which gives us an expanded training dataset.

# Labelled training tweets.
twitter_train <- read.csv("train.csv", stringsAsFactors = FALSE)

# Extra labelled tweets from the crowdflower_weather data, reduced to the two
# columns used here.
# data(crowdflower_weather)
cfw <- data.frame(crowdflower_weather, stringsAsFactors = FALSE)
cfw <- cfw[, c("text", "sentiment")]

# Align cfw with twitter_train: name the text column "tweet" and shift the
# sentiment codes by 2 (-1 -> 1, 0 -> 2, 1 -> 3).
colnames(cfw) <- c("tweet", "sentiment")
cfw$sentiment <- cfw$sentiment + 2

# Tweets to be scored; sentiment starts as a placeholder that is overwritten
# once predictions are available.
twitter_test <- read.csv("test.csv", stringsAsFactors = FALSE)
twitter_test$sentiment <- 0

# Expanded training set: original training rows followed by the cfw rows.
twitter_train <- rbind(twitter_train, cfw)

The tokenization step takes an ID for each document, so we assign sequential IDs to the rows of the (expanded) training dataset here.

# Give every training row a sequential integer ID (1..n). Assigning the whole
# column at once replaces the element-wise `for (i in 1:nrow(...))` loop and,
# unlike `1:nrow(...)`, is also well-defined when the data frame has no rows.
twitter_train$ID <- seq_len(nrow(twitter_train))

Pre-processing

Next, we have our data pre-processing and tokenization steps.

# Text normalisation and tokenisation shared by the train and test corpora.
prep_fun <- tolower
tok_fun <- word_tokenizer

# Token iterator over the training tweets.
it_train <- itoken(
  twitter_train$tweet,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = twitter_train$ID,
  progressbar = FALSE
)

# Unigram + bigram vocabulary built from the training tweets only, pruned to
# terms that occur at least 10 times and in at most half of the documents.
vocab <- create_vocabulary(it_train, ngram = c(1L, 2L))
vocab <- prune_vocabulary(vocab, term_count_min = 10, doc_proportion_max = 0.5)

# Token iterator over the test tweets, built exactly like the training one.
# The ids come from the `id` column, which is the column the submission code
# reads; `twitter_test$ID` is NULL on a data frame that only has `id`, so the
# intended ids were silently dropped.
# NOTE(review): assumes test.csv names its identifier column `id` - confirm.
it_test <- itoken(
  twitter_test$tweet,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = twitter_test$id,
  progressbar = FALSE
)

bigram_vectorizer <- vocab_vectorizer(vocab)
# define tfidf model
tfidf <- TfIdf$new()
# fit model to train data and transform train data with fitted model
dtm_train <- create_dtm(it_train, bigram_vectorizer)
# tfidf modified by fit_transform() call!
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)

# apply pre-trained tf-idf transformation to test data
# (the test matrix is only transformed, never used to fit the model)
dtm_test_tfidf <- create_dtm(it_test, bigram_vectorizer)
dtm_test_tfidf <- transform(dtm_test_tfidf, tfidf)

Train a classifier

Best Classifier: Cross-Validation Glmnet

After running a k-fold cross-validation with k=10, we find the optimal lambda to tune our model later on.

# Fix the RNG seed before cross-validation so the run is repeatable.
set.seed(6)
k <- 10

# k-fold cross-validated multinomial glmnet (alpha = 1) on the TF-IDF training
# matrix, scored by misclassification error.
glmnet_classifier <- cv.glmnet(
  x = dtm_train_tfidf,
  y = twitter_train[["sentiment"]],
  family = "multinomial",
  alpha = 1,
  type.measure = "class",
  nfolds = k
)

# graph of cross validated mean error against lambda
plot(glmnet_classifier$lambda, glmnet_classifier$cvm)

# cross-validated error at the selected lambda (smallest cvm)
with(glmnet_classifier, cvm[lambda == lambda.min])
## [1] 0.1043288
# lambda that minimises the cross-validated error
glmnet_classifier$lambda.min
## [1] 0.002376492
# variable importance of model
vip(glmnet_classifier)

Make predictions on the test dataset

To make predictions on the test dataset, we use the predict function with s set to the optimal lambda found above (lambda.min, which was 0.002376492 in this run) and type set to “response”. We then add these predictions to the test dataset by selecting, for each tweet, the sentiment with the maximum predicted probability. Finally, we prepare the submission file.

# Class probabilities for the test tweets at the cross-validated optimum.
# `s = "lambda.min"` reads the value from the fitted object, so it stays
# correct if the data, seed or vocabulary change; a hand-copied, rounded
# lambda would go stale and is not exactly on the fitted lambda path.
preds <- predict(
  glmnet_classifier,
  dtm_test_tfidf,
  s = "lambda.min",
  type = "response"
)

# preds is indexed as [observation, class, lambda]; for each observation take
# the position of the most probable class. seq_len() keeps this safe when
# there are no rows to score.
best_class <- vapply(
  seq_len(nrow(preds)),
  function(i) which.max(preds[i, , ]),
  integer(1)
)
# Stored as character, as before; converted to numeric for the submission.
twitter_test$sentiment <- as.character(best_class)
#head(twitter_test)

# Two-column numeric matrix (id, sentiment) in submission format.
twitter_submission <- cbind(
  id = as.numeric(twitter_test$id),
  sentiment = as.numeric(twitter_test$sentiment)
)

#write.csv(twitter_submission,'SubmissionCVGlmnet.csv',row.names = FALSE)